Used libraries

library(tidyverse)
library(knitr)
# Load the training data from a local CSV file into `df`.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact location exists -- consider a project-relative path.
df <- read_csv(file = "C:/Users/admin/Downloads/Drive Data/train_data.csv")

Train data dimensions:

# Row and column counts for the columns at positions 2 through 18
# (the first column of `df` is left out).
df %>%
  select(2:18) %>%
  dim()
## [1] 1000000      17

Train data variable summary

# Per-column summary statistics for the columns at positions 2 through 18,
# rendered as a table with kable().
df %>%
  select(2:18) %>%
  summary() %>%
  kable()
id y amount_current_loan term credit_score loan_purpose yearly_income home_ownership bankruptcies years_current_job monthly_debt years_credit_history months_since_last_delinquent open_accounts credit_problems credit_balance max_open_credit
Min. : 1 Min. :0.0 Min. : 10802 Length:1000000 Length:1000000 Length:1000000 Min. : 76627 Length:1000000 Min. :0.0000 Min. : 0.00 Min. : 0 Min. : 4.0 Min. : 0.0 Min. : 0.00 Min. : 0.0000 Min. : 0 Min. :0.000e+00
1st Qu.: 250001 1st Qu.:0.0 1st Qu.:174394 Class :character Class :character Class :character 1st Qu.: 825797 Class :character 1st Qu.:0.0000 1st Qu.: 3.00 1st Qu.: 10324 1st Qu.:13.0 1st Qu.: 16.0 1st Qu.: 8.00 1st Qu.: 0.0000 1st Qu.: 113392 1st Qu.:2.700e+05
Median : 500001 Median :0.5 Median :269676 Mode :character Mode :character Mode :character Median : 1148550 Mode :character Median :0.0000 Median : 6.00 Median : 16319 Median :17.0 Median : 32.0 Median :10.00 Median : 0.0000 Median : 210539 Median :4.600e+05
Mean : 500001 Mean :0.5 Mean :316659 NA NA NA Mean : 1344805 NA Mean :0.1192 Mean : 5.88 Mean : 18550 Mean :18.1 Mean : 34.9 Mean :11.18 Mean : 0.1762 Mean : 293847 Mean :7.367e+05
3rd Qu.: 750000 3rd Qu.:1.0 3rd Qu.:435160 NA NA NA 3rd Qu.: 1605899 NA 3rd Qu.:0.0000 3rd Qu.:10.00 3rd Qu.: 24059 3rd Qu.:22.0 3rd Qu.: 51.0 3rd Qu.:14.00 3rd Qu.: 0.0000 3rd Qu.: 367422 3rd Qu.:7.674e+05
Max. :1000000 Max. :1.0 Max. :789250 NA NA NA Max. :165557393 NA Max. :7.0000 Max. :10.00 Max. :435843 Max. :70.0 Max. :176.0 Max. :76.00 Max. :15.0000 Max. :32878968 Max. :1.540e+09
NA NA NA NA NA NA NA’s :219439 NA NA’s :1805 NA’s :45949 NA NA NA’s :529539 NA NA NA NA’s :27
# Recode `loan_purpose` and `y` as factors so that later steps treat them as
# categorical (e.g. `y` as a discrete fill in the bar chart).
df <- df %>%
  mutate(
    loan_purpose = as.factor(loan_purpose),
    y = as.factor(y)
  )

Summary of the categorical variable loan_purpose (number of records per purpose, most frequent first)

# Number of records per loan purpose, most frequent first, shown as a table.
df %>%
  count(loan_purpose, sort = TRUE) %>%
  kable()
loan_purpose n
debt_consolidation 785428
other 91481
home_improvements 57517
business_loan 17756
buy_a_car 11855
medical_bills 11521
buy_house 6897
take_a_trip 5632
major_purchase 3727
small_business 3242
moving 1548
vacation 1166
wedding 1129
educational_expenses 992
renewable_energy 109
# Record counts per loan purpose, split by `y`, drawn as horizontal
# side-by-side bars.
# count() replaces group_by() + summarise(n = n()): it yields the same
# y / loan_purpose / n columns but returns an ungrouped result, so no leftover
# grouping by `y` is passed on and summarise() does not emit its
# "grouped output" message.
df %>%
  count(y, loan_purpose) %>%
  ggplot(aes(x = loan_purpose, y = n, fill = y)) +
  # geom_col() is the bar geom for pre-computed heights
  # (equivalent to geom_bar(stat = "identity")).
  geom_col(position = "dodge") +
  coord_flip() +
  # Thousands separators on the count axis.
  scale_y_continuous(labels = scales::comma) +
  theme_dark()

Top 10 loan purposes among records with y = 1:

# Ten most frequent loan purposes among the records where `y` equals 1,
# shown as a table.
df %>%
  filter(y == 1) %>%
  count(loan_purpose, sort = TRUE) %>%
  head(10) %>%
  kable()
loan_purpose n
debt_consolidation 391875
other 44888
home_improvements 27274
business_loan 10356
medical_bills 6286
buy_a_car 5810
buy_house 3652
take_a_trip 2870
small_business 2152
major_purchase 2120

Number of missing values per column (only columns with at least one missing value are shown)

# Count the NA entries in every column of `df`. is.na() returns only
# TRUE/FALSE, so the column sums need no NA handling of their own.
na_count <- df %>%
  is.na() %>%
  colSums()

# Show only the columns that actually contain missing values.
na_count[na_count > 0]
##                 credit_score                yearly_income 
##                       314333                       219439 
##                 bankruptcies            years_current_job 
##                         1805                        45949 
## months_since_last_delinquent              max_open_credit 
##                       529539                           27

Interactive table of loan purpose by y and bar chart of credit score by y, for further analysis

library(DT)
# Interactive table of record counts for every (y, loan_purpose) combination.
# count() replaces group_by() + summarise(n = n()): same columns and rows, but
# the result is ungrouped, so no leftover grouping by `y` is handed to
# datatable() and summarise() does not emit its "grouped output" message.
df %>%
  count(y, loan_purpose) %>%
  datatable()
library(plotly)
# Interactive bar chart of record counts per credit_score value, with one
# named trace per level of `y`.
# count() replaces group_by() + summarise(n = n()): same columns and rows, but
# the result is ungrouped, so no leftover grouping by `y` is handed to
# plot_ly() and summarise() does not emit its "grouped output" message.
df %>%
  count(y, credit_score) %>%
  plot_ly(x = ~credit_score, y = ~n, name = ~y, type = "bar")